import pandas as pd
df = pd.read_csv('Reviews.csv')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
# Product Scores
fig = px.histogram(df, x="Score")
fig.update_traces(marker_color="turquoise",marker_line_color='rgb(8,48,107)',
marker_line_width=1.5)
fig.update_layout(title_text='Product Score')
fig.show()
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
chachedWords = stopwords.words('english')
from wordcloud import WordCloud
# Create stopword list:
stopwords = set(stopwords.words('english'))
stopwords.update(["br", "href"])
textt = " ".join(review for review in df.Text)
wordcloud = WordCloud(stopwords=stopwords).generate(textt)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud11.png')
plt.show()
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\Humberto\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
df['sentiment'] = df['Score'].apply(lambda rating : +1 if rating > 3 else -1)
df
| Id | ProductId | UserId | ProfileName | HelpfulnessNumerator | HelpfulnessDenominator | Score | Time | Summary | Text | sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | B001E4KFG0 | A3SGXH7AUHU8GW | delmartian | 1 | 1 | 5 | 1303862400 | Good Quality Dog Food | I have bought several of the Vitality canned d... | 1 |
| 1 | 2 | B00813GRG4 | A1D87F6ZCVE5NK | dll pa | 0 | 0 | 1 | 1346976000 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... | -1 |
| 2 | 3 | B000LQOCH0 | ABXLMWJIXXAIN | Natalia Corres "Natalia Corres" | 1 | 1 | 4 | 1219017600 | "Delight" says it all | This is a confection that has been around a fe... | 1 |
| 3 | 4 | B000UA0QIQ | A395BORC6FGVXV | Karl | 3 | 3 | 2 | 1307923200 | Cough Medicine | If you are looking for the secret ingredient i... | -1 |
| 4 | 5 | B006K2ZZ7K | A1UQRSCLF8GW1T | Michael D. Bigham "M. Wassir" | 0 | 0 | 5 | 1350777600 | Great taffy | Great taffy at a great price. There was a wid... | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 568449 | 568450 | B001EO7N10 | A28KG5XORO54AY | Lettie D. Carter | 0 | 0 | 5 | 1299628800 | Will not do without | Great for sesame chicken..this is a good if no... | 1 |
| 568450 | 568451 | B003S1WTCU | A3I8AFVPEE8KI5 | R. Sawyer | 0 | 0 | 2 | 1331251200 | disappointed | I'm disappointed with the flavor. The chocolat... | -1 |
| 568451 | 568452 | B004I613EE | A121AA1GQV751Z | pksd "pk_007" | 2 | 2 | 5 | 1329782400 | Perfect for our maltipoo | These stars are small, so you can give 10-15 o... | 1 |
| 568452 | 568453 | B004I613EE | A3IBEVCTXKNOH | Kathy A. Welch "katwel" | 1 | 1 | 5 | 1331596800 | Favorite Training and reward treat | These are the BEST treats for training and rew... | 1 |
| 568453 | 568454 | B001LR2CU2 | A3LGQPJCZVL9UC | srfell17 | 0 | 0 | 5 | 1338422400 | Great Honey | I am very satisfied ,product is as advertised,... | 1 |
568454 rows × 11 columns
positive = df[df['sentiment'] == 1]
negative = df[df['sentiment'] == -1]
stopwords = set(stopwords)
stopwords.update(["br", "href","good","great"])
pos = " ".join(review for review in positive.Summary)
wordcloud2 = WordCloud(stopwords=stopwords).generate(pos)
plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis("off")
plt.show()
negative['Summary'] = negative['Summary'].apply(str)
<ipython-input-16-d41b350b8f8f>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
neg = " ".join(review for review in negative.Summary)
wordcloud3 = WordCloud(stopwords=stopwords).generate(neg)
plt.imshow(wordcloud3, interpolation='bilinear')
plt.axis("off")
plt.savefig('wordcloud33.png')
plt.show()
df['sentimentt'] = df['sentiment'].replace({-1 : 'negative'})
df['sentimentt'] = df['sentimentt'].replace({1 : 'positive'})
fig = px.histogram(df, x="sentimentt")
fig.update_traces(marker_color="indianred",marker_line_color='rgb(8,48,107)',
marker_line_width=1.5)
fig.update_layout(title_text='Product Sentiment')
fig.show()
def remove_punctuation(text):
final = "".join(u for u in text if u not in ("?", ".", ";", ":", "!",'"'))
return final
df['Text'] = df['Text'].apply(remove_punctuation)
df = df.dropna(subset=['Summary'])
df['Summary'] = df['Summary'].apply(remove_punctuation)
<ipython-input-22-903d5d021ed6>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
dfNew = df[['Summary','sentiment']]
dfNew
| Summary | sentiment | |
|---|---|---|
| 0 | Good Quality Dog Food | 1 |
| 1 | Not as Advertised | -1 |
| 2 | Delight says it all | 1 |
| 3 | Cough Medicine | -1 |
| 4 | Great taffy | 1 |
| ... | ... | ... |
| 568449 | Will not do without | 1 |
| 568450 | disappointed | -1 |
| 568451 | Perfect for our maltipoo | 1 |
| 568452 | Favorite Training and reward treat | 1 |
| 568453 | Great Honey | 1 |
568427 rows × 2 columns
index = df.index
df['random_number'] = np.random.randn(len(index))
train = df[df['random_number'] <= 0.8]
test = df[df['random_number'] > 0.8]
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['Summary'].values.astype('U'))
test_matrix = vectorizer.transform(test['Summary'].values.astype('U'))
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
X_train = train_matrix
X_test = test_matrix
train_matrix
<447831x33008 sparse matrix of type '<class 'numpy.int64'>' with 1836177 stored elements in Compressed Sparse Row format>
y_train = train['sentiment']
y_test = test['sentiment']
lr.fit(X_train,y_train)
C:\Users\Humberto\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
LogisticRegression()
predictions = lr.predict(X_test)
from sklearn.metrics import confusion_matrix,classification_report
new = np.asarray(y_test)
confusion_matrix(predictions,y_test)
array([[17307, 3918],
[ 9151, 90220]], dtype=int64)
print(classification_report(predictions,y_test))
precision recall f1-score support
-1 0.65 0.82 0.73 21225
1 0.96 0.91 0.93 99371
accuracy 0.89 120596
macro avg 0.81 0.86 0.83 120596
weighted avg 0.90 0.89 0.90 120596